import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# gnn
import torch
import torch_geometric
import torch.nn.functional as F
from torch_geometric.nn import GCNConv- fraudTrain
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain| trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
| 2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
| 3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
| 4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
| 1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
| 1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
| 1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
| 1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
- df02
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02 = df02.reset_index()- df_toy
df_toy=df02[:5].copy()
df_toy.cc_num = pd.Series([1,1,1,2,2])
df_toy| index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 669418 | 2019-10-12 18:21:00 | 1 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
| 1 | 32567 | 2019-01-20 13:06:00 | 1 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
| 2 | 156587 | 2019-03-24 18:09:00 | 1 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
| 3 | 1020243 | 2020-02-25 15:12:00 | 2 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
| 4 | 116272 | 2019-03-06 23:19:00 | 2 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
- df_toy 에서 time_difference 구함
고객1
df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[1].trans_date_trans_time.value22914900000000000
df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value17453520000000000
df_toy.iloc[1].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value-5461380000000000
고객2
df_toy.iloc[3].trans_date_trans_time.value - df_toy.iloc[4].trans_date_trans_time.value30729180000000000
고객1,2
def compute_time_difference(group):
n = len(group)
result = []
for i in range(n):
for j in range(n):
time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
return resultgroups = df_toy.groupby('cc_num')
edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
edge_index_list_plus_nparrarray([[ 0, 0, 0],
[ 0, 1, 22914900000000000],
[ 0, 2, 17453520000000000],
[ 1, 0, 22914900000000000],
[ 1, 1, 0],
[ 1, 2, 5461380000000000],
[ 2, 0, 17453520000000000],
[ 2, 1, 5461380000000000],
[ 2, 2, 0],
[ 3, 3, 0],
[ 3, 4, 30729180000000000],
[ 4, 3, 30729180000000000],
[ 4, 4, 0]])
- df02에서 time_difference 구함
# t1 = time.time()
# groups = df02.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
# t2 = time.time()
# t2-t1groups = df02.groupby("cc_num")edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))edge_index_list_plus02array([[ 2881, 2881, 0],
[ 2881, 3061, 0],
[ 2881, 4867, 0],
...,
[212771, 212765, 0],
[212771, 212769, 0],
[212771, 212771, 0]])
theta = edge_index_list_plus02[:,2].mean()edge_index_list_plus02 = np.load('edge_index_list_plus02.npy').astype(np.float64)edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))edge_index_list_plus02array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
[2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
[2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
...,
[2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
[2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
[2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])
같은 cc_num별로.. 시간 차이를 계산했어.
weight = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))weightarray([0. , 0.19606128, 0.81291817, ..., 0.9977087 , 0.9999232 ,
0. ])
자꾸 아래처럼 하면 3열의 값이 다 0나와가지고;; 이상하게 해봄
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))edge_index_list_plus02 = np.column_stack((edge_index_list_plus02, weight))edge_index_list_plus02array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00, 0.00000000e+00],
[2.88100000e+03, 3.06100000e+03, 1.90922400e+16, 1.96061280e-01],
[2.88100000e+03, 4.86700000e+03, 2.42706000e+15, 8.12918172e-01],
...,
[2.12771000e+05, 2.12765000e+05, 2.68800000e+13, 9.97708695e-01],
[2.12771000e+05, 2.12769000e+05, 9.00000000e+11, 9.99923197e-01],
[2.12771000e+05, 2.12771000e+05, 0.00000000e+00, 0.00000000e+00]])
edge_index_list_plus02 = np.delete(edge_index_list_plus02, 2, axis=1)edge_index_list_plus02array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
[2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
[2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
...,
[2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
[2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
[2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])
edge_index_list_plus02.shape(65831594, 3)
edge_index_list_updated = edge_index_list_plus02.tolist()np.array(edge_index_list_updated)[:,2].mean()0.4536043999922591
mm = np.array(edge_index_list_updated)[:,2].mean()selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shapetorch.Size([2, 29970380])
tr/test
df02_tr,df02_test = sklearn.model_selection.train_test_split(df02, random_state=42)df02_tr.shape, df02_test.shape((160890, 23), (53630, 23))
N = len(df02)
train_mask = [i in df02_tr.index for i in range(N)]
test_mask = [i in df02_test.index for i in range(N)]train_mask = np.array(train_mask)
test_mask = np.array(test_mask)train_mask.shape, test_mask.shape((214520,), (214520,))
data
x = torch.tensor(df02['amt'], dtype=torch.float).reshape(-1,1)
y = torch.tensor(df02['is_fraud'],dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)dataData(x=[214520, 1], edge_index=[2, 29970380], y=[214520], train_mask=[214520], test_mask=[214520])
class GCN(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = GCNConv(1, 16)
self.conv2 = GCNConv(16,2)
def forward(self, data):
x, edge_index = data.x, data.edge_index
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(400):
optimizer.zero_grad()
out = model(data)
loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
loss.backward()
optimizer.step()model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(np.array(data.test_mask).sum())
print(f'Accuracy: {acc:.4f}')Accuracy: 0.9707
predicted_labels = pred[data.test_mask]
true_labels = data.y[data.test_mask]precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')Precision: 0.4854
Recall: 0.5000
F1 Score: 0.4926
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))